emanuela.damieto@studenti.unitn.it
This section and the next are relevant for reproducibility purposes. For results, please skip to section 3 (Quality Control)
suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(magrittr)
library(parallel)
library(pheatmap)
library(plotly)
library(pvclust)
library(tidyverse)
library(tximport)
library(vsn)
library(emoji)
})source(here("UPSCb-common/src/R/featureSelection.R"))hpal <- colorRampPalette(c("blue","white","red"))(100)samples <- read_csv(here("data/drought_roots.csv"),
col_types=cols(.default=col_factor()))tx2gene <- suppressMessages(read_delim(here("reference/annotation/Picab02_tx2gene.tsv.gz"), delim="\t", col_names=c("TXID","GENE"), skip=1))filelist <- list.files(here("results/Salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)stopifnot(all(match(sub("[1-3]_151118_BC852HANXX_P2503_", "", sub("*_sortmerna_trimmomatic","",basename(dirname(filelist)))),
samples$SciLifeID) == 1:nrow(samples)))
samples_rep <-rbind(samples,samples)
samples_rep <- rbind(samples_rep,samples)samples_rep %<>% mutate(Filenames = filelist)
samples_rep$BioID <- sub("[1-3]_151118_BC852HANXX_", "", sub("*_sortmerna_trimmomatic","",basename(dirname(samples_rep$Filenames))))txi <- suppressMessages(tximport(files = samples_rep$Filenames,
type = "salmon",
tx2gene=tx2gene))
counts <- txi$counts
counts <- do.call(
cbind,
lapply(split.data.frame(t(counts),
samples$SciLifeID),
colSums))
csamples <- samples[,-1]
csamples <- csamples[match(colnames(counts),csamples$SciLifeID),]
colnames(counts) <- csamples$SciLifeID
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))## [1] "8.3% percent (3627) of 43909 genes are not expressed"
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(csamples)
ggplot(dat,aes(x,y,fill=Level)) +
geom_col() +
scale_y_continuous(name="reads") +
facet_grid(~ factor(Level), scales = "free") +
theme_bw() +
theme(axis.text.x=element_text(angle=90,size=4), axis.title.x=element_blank()) +
labs(title ="Sample sequencing depth")👉 There is not a big difference in the raw sequencing depth
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) +
geom_density(na.rm = TRUE) +
ggtitle("Gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)") +
theme_bw()👉 The per-gene mean expression is as expected since the highest peak is around 2
dat <- as.data.frame(log10(counts)) %>%
utils::stack() %>%
mutate(Level=samples_rep$Level[match(ind,csamples$SciLifeID)])
ggplot(dat,aes(x=values,group=ind,col=Level)) +
geom_density(na.rm = TRUE) +
ggtitle("Sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)") +
theme_bw()👉 All samples have the same sequencing depth characteristics and there is no deviation when we look at one or the other condition
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data_merge.csv"))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample condition and replicate.
load(file=here("data/analysis/salmon/dds_merge.rda"))(i.e. the sequencing library size effect)
dds <- estimateSizeFactors(dds) %>%
suppressMessages()boxplot of the sequencing libraries size factor:
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y")
abline(h=1, col = "Red", lty = 3)and without outliers:
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y", outline=FALSE)
abline(h=1, col = "Red", lty = 3)👉 The sequencing libraries size factor is a bit variable across samples
Let’s look at standard deviations before and after VST normalization. This plot is to see whether there is a dependency of SD on the mean.
Before:
meanSdPlot(log1p(counts(dds))[rowSums(counts(dds))>0,])After VST normalization, the red line should be almost horizontal which indicates no dependency of variance on mean (homoscedastic).
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
meanSdPlot(vst[rowSums(vst)>0,])👉 We can conclude that the variance stabilization worked adequately even if the red line is not perfectly horizontal
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)We define the number of variables of the model (=1) and the number of possible combinations (=8).
We plot the percentage explained by different components
nvar=1
nlevel=nlevels(dds$Level)
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)+
ggtitle("Percentage of variance explained by different components")## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## Warning: Removed 2 rows containing missing values (`geom_line()`).
dds$Filenames <- sub("*_151118_BC852HANXX_P2503_", "_", sub("*_sortmerna_trimmomatic","",basename(dirname(dds$Filenames))))
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
as.data.frame(colData(dds)))
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Level,shape=Level,text=Filenames)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 8. Consider
## specifying shapes manually if you must have them.
👉 Some conditions clusterized better than others in the PCA plot
Number of genes expressed per condition at different cutoffs:
conds <- factor(dds$Level)
dev.null <- rangeSamplesSummary(counts=vst,
conditions=conds,
nrep=3)Filter for noise
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3) %>%
suppressWarnings()vst.cutoff <- 2
abline(h=10000, col="Red", lty=3)vst.cutoff <- 6
hm_reduced <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)plot(as.hclust(hm_reduced$colDendrogram),xlab="",sub="")mat <- t(scale(t(vst[sels[[vst.cutoff+1]],])))
hm <- pheatmap(mat,
color = hpal,
border_color = NA,
clustering_distance_rows = "correlation",
clustering_method = "ward.D2",
show_rownames = F,
labels_col = conds,
angle_col = 90,
legend = F)plot(as.hclust(hm_reduced$colDendrogram),xlab="",sub="", cex.axis=2)👉 The different conditions are not so different in gene expression level except the extreme conditions
Done to assess the previous dendrogram’s reproducibility
hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[vst.cutoff+1]],]))),
method.hclust = "ward.D2",
nboot = 1000, parallel = TRUE)## Creating a temporary cluster...done:
## socket cluster with 15 nodes on host 'localhost'
## Multiscale bootstrap... Done.
Plot the clustering with bp and au
plot(hm.pvclust, labels = conds, cex.axis=1.5)
pvrect(hm.pvclust)👉 Some conditions clusterize better than others
print(hm.pvclust, digits=3)##
## Cluster method: ward.D2
## Distance : correlation
##
## Estimates on edges:
##
## si au bp se.si se.au se.bp v c pchi
## 1 0.975 0.987 0.991 0.014 0.008 0.001 -2.291 -0.073 0.294
## 2 0.857 0.938 0.886 0.022 0.012 0.003 -1.371 0.167 0.350
## 3 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 4 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 5 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 6 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 7 0.979 0.988 0.994 0.013 0.008 0.001 -2.392 -0.128 0.705
## 8 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 9 0.976 0.987 0.994 0.014 0.009 0.001 -2.369 -0.156 0.644
## 10 0.031 0.495 0.537 0.032 0.031 0.005 -0.040 -0.053 0.055
## 11 0.441 0.709 0.744 0.038 0.028 0.005 -0.602 -0.053 0.038
## 12 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 13 0.951 0.967 0.996 0.035 0.026 0.001 -2.254 -0.410 0.419
## 14 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 15 0.565 0.832 0.649 0.035 0.020 0.005 -0.673 0.290 0.283
## 16 0.236 0.763 0.392 0.046 0.024 0.005 -0.221 0.496 0.070
## 17 0.193 0.725 0.412 0.045 0.026 0.005 -0.188 0.410 0.052
## 18 0.000 0.569 0.410 0.000 0.031 0.005 0.026 0.201 0.669
## 19 0.247 0.710 0.488 0.041 0.026 0.005 -0.262 0.292 0.145
## 20 0.205 0.577 0.639 0.036 0.031 0.005 -0.275 -0.081 0.880
## 21 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 22 0.132 0.797 0.267 0.057 0.024 0.005 -0.105 0.726 0.064
## 23 0.141 0.765 0.315 0.051 0.025 0.005 -0.121 0.602 0.065
## 24 0.141 0.765 0.315 0.051 0.025 0.005 -0.121 0.602 0.065
## 25 0.141 0.765 0.315 0.051 0.025 0.005 -0.121 0.602 0.065
## 26 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 27 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
⭐ The data are quite good so we can continue with our analysis
sessionInfo()## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel grid stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] emoji_15.0 vsn_3.64.0
## [3] tximport_1.24.0 forcats_0.5.2
## [5] stringr_1.5.0 dplyr_1.0.10
## [7] purrr_1.0.1 readr_2.1.3
## [9] tidyr_1.2.1 tibble_3.1.8
## [11] tidyverse_1.3.2 pvclust_2.2-0
## [13] plotly_4.10.1 pheatmap_1.0.12
## [15] magrittr_2.0.3 hyperSpec_0.100.0
## [17] xml2_1.3.3 ggplot2_3.4.0
## [19] lattice_0.20-45 here_1.0.1
## [21] gplots_3.1.3 DESeq2_1.36.0
## [23] SummarizedExperiment_1.28.0 Biobase_2.58.0
## [25] MatrixGenerics_1.10.0 matrixStats_0.63.0
## [27] GenomicRanges_1.50.1 GenomeInfoDb_1.34.4
## [29] IRanges_2.32.0 S4Vectors_0.36.1
## [31] BiocGenerics_0.44.0 data.table_1.14.6
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.1 backports_1.4.1 lazyeval_0.2.2
## [4] splines_4.2.0 crosstalk_1.2.0 BiocParallel_1.32.4
## [7] digest_0.6.31 htmltools_0.5.4 fansi_1.0.3
## [10] memoise_2.0.1 googlesheets4_1.0.1 limma_3.52.4
## [13] tzdb_0.3.0 Biostrings_2.66.0 annotate_1.74.0
## [16] modelr_0.1.10 vroom_1.6.0 timechange_0.1.1
## [19] jpeg_0.1-10 colorspace_2.0-3 blob_1.2.3
## [22] rvest_1.0.3 haven_2.5.1 xfun_0.35
## [25] hexbin_1.28.2 crayon_1.5.2 RCurl_1.98-1.9
## [28] jsonlite_1.8.4 genefilter_1.78.0 survival_3.4-0
## [31] glue_1.6.2 gtable_0.3.1 gargle_1.2.1
## [34] zlibbioc_1.44.0 XVector_0.38.0 DelayedArray_0.24.0
## [37] scales_1.2.1 DBI_1.1.3 Rcpp_1.0.9
## [40] viridisLite_0.4.1 xtable_1.8-4 bit_4.0.5
## [43] preprocessCore_1.58.0 htmlwidgets_1.6.1 httr_1.4.4
## [46] RColorBrewer_1.1-3 ellipsis_0.3.2 farver_2.1.1
## [49] pkgconfig_2.0.3 XML_3.99-0.13 sass_0.4.4
## [52] dbplyr_2.2.1 deldir_1.0-6 locfit_1.5-9.7
## [55] utf8_1.2.2 labeling_0.4.2 tidyselect_1.2.0
## [58] rlang_1.0.6 AnnotationDbi_1.60.0 munsell_0.5.0
## [61] cellranger_1.1.0 tools_4.2.0 cachem_1.0.6
## [64] cli_3.6.0 generics_0.1.3 RSQLite_2.2.20
## [67] broom_1.0.2 evaluate_0.19 fastmap_1.1.0
## [70] yaml_2.3.6 knitr_1.41 bit64_4.0.5
## [73] fs_1.5.2 caTools_1.18.2 KEGGREST_1.38.0
## [76] brio_1.1.3 compiler_4.2.0 rstudioapi_0.14
## [79] png_0.1-8 testthat_3.1.6 affyio_1.66.0
## [82] reprex_2.0.2 geneplotter_1.74.0 bslib_0.4.2
## [85] stringi_1.7.12 highr_0.10 Matrix_1.5-3
## [88] vctrs_0.5.1 pillar_1.8.1 lifecycle_1.0.3
## [91] BiocManager_1.30.19 jquerylib_0.1.4 bitops_1.0-7
## [94] R6_2.5.1 latticeExtra_0.6-30 affy_1.74.0
## [97] KernSmooth_2.23-20 codetools_0.2-18 gtools_3.9.4
## [100] assertthat_0.2.1 rprojroot_2.0.3 withr_2.5.0
## [103] GenomeInfoDbData_1.2.9 hms_1.1.2 rmarkdown_2.19
## [106] googledrive_2.0.0 lubridate_1.9.0 interp_1.1-3